Hi! Please work through this short intro before the workshop — especially the package installation part. Work until the line which says "This is the end of the intro" - we’ll continue from there on together. If you come prepared, you’ll be able to follow the workshop easily; if you don’t work through this short intro, your experience will not be very good.
# Run this code block; it should start throwing messages about installing a bunch of stuff in the console. This will take some time but only needs to be done once!
# All the packages needed for this workshop:
p = c("tidyverse","plotly","quanteda","umap","shadowtext","text2vec","doc2vec",
      "ggplot2movies","cld3","writexl","textutils","readxl","xml2","jsonlite",
      "readtext","xmlconvert")
install.packages(p)
# Check which of the requested packages are actually present now:
x = p %in% rownames(installed.packages())
if(all(x)){
  print("All packages installed successfully!")
} else {
  # collapse the missing names into ONE string; without collapse= the outer
  # paste() is vectorized and would print a separate message per missing package
  print(paste("Failed to install:", paste(p[!x], collapse=", "),
              ", try again and make sure you have internet connection."))
}
# If it asks "Do you want to install from sources the package which needs compilation?" just go for "no".
This is the end of the intro
# Load the necessary packages - this needs to be done every time you restart R
# To run the entire code block here, click the little green triangle > in the top right corner of the code block. Do that now.
# Or put your cursor on the first line of the code and press CTRL+ENTER (CMD+ENTER)
suppressWarnings(suppressMessages({ # -> Run this! (it might take a moment)
# (the two wrappers above just hide the packages' noisy but harmless startup messages)
library(tidyverse) # includes ggplot2, dplyr, tibble, readr
library(plotly) # interactive plots
library(quanteda) # a corpus pkg & its addons
library(umap) # for dimension reduction
library(shadowtext) # ggplot addon
library(text2vec) # NLP
library(doc2vec) # embeddings
library(ggplot2movies) # datasets
library(cld3) # language detection
library(writexl) # for working with various formats
library(textutils) # HTML entity encoding/decoding
library(readxl) # Excel import
library(xml2) # XML/HTML parsing
library(jsonlite) # JSON import/export
library(readtext) # quanteda companion for importing text files
library(xmlconvert) # data frame <-> XML conversion
# if this produces a plot saying "Welcome!", everything loaded fine:
ggplot()+annotate("text",x=0,y=0,label="Welcome!") # a little test
}))
Troubleshooting clipboard: if needed, describe your error there and then ask for assistance over the chat: https://hackmd.io/@andreskarjus/HyaRgdbxY/edit
Solutions clipboard: https://hackmd.io/@andreskarjus/SkTrzBZgK/edit
For this section, we’ll just use a little corpus that comes with the quanteda package, the inaugural speeches of US presidents.
library(quanteda) # a corpus linguistics package; we'll also make use of a dataset in it:
library(plotly) # for ggplotly
# Let's inspect the data first
data_corpus_inaugural # it's a quanteda corpus object; when called, also displays some metadata
## Corpus consisting of 59 documents and 4 docvars.
## 1789-Washington :
## "Fellow-Citizens of the Senate and of the House of Representa..."
##
## 1793-Washington :
## "Fellow citizens, I am again called upon by the voice of my c..."
##
## 1797-Adams :
## "When it was first perceived, in early times, that no middle ..."
##
## 1801-Jefferson :
## "Friends and Fellow Citizens: Called upon to undertake the du..."
##
## 1805-Jefferson :
## "Proceeding, fellow citizens, to that qualification which the..."
##
## 1809-Madison :
## "Unwilling to depart from examples of the most revered author..."
##
## [ reached max_ndoc ... 53 more documents ]
head(tokens(data_corpus_inaugural[[1]])) # the first words of the first speech
## Tokens consisting of 1 document.
## text1 :
## [1] "Fellow-Citizens" "of" "the" "Senate"
## [5] "and" "of" "the" "House"
## [9] "of" "Representatives" ":" "Among"
## [ ... and 1,525 more ]
head(summary(data_corpus_inaugural)) # quanteda has a summary function for its corpus objects
## Text Types Tokens Sentences Year President FirstName
## 1 1789-Washington 625 1537 23 1789 Washington George
## 2 1793-Washington 96 147 4 1793 Washington George
## 3 1797-Adams 826 2577 37 1797 Adams John
## 4 1801-Jefferson 717 1923 41 1801 Jefferson Thomas
## 5 1805-Jefferson 804 2380 45 1805 Jefferson Thomas
## 6 1809-Madison 535 1261 21 1809 Madison James
## Party
## 1 none
## 2 none
## 3 Federalist
## 4 Democratic-Republican
## 5 Democratic-Republican
## 6 Democratic-Republican
sum(summary(data_corpus_inaugural)$Tokens) # total number of words
## [1] 151536
# Let's record that output as an object for later use:
# (a data frame with one row per speech: Text, Types, Tokens, Sentences, Year, President, ...)
metadata = summary(data_corpus_inaugural)
# Let's plot the lengths of the speeches over time:
ggplot(metadata, aes(x=Year, y=Tokens)) +
geom_point()+
theme_minimal() +
# the trailing NULL lets you freely add or comment out layers above without breaking the + chain
NULL
Exercise: try adding a geom_line() layer to the plot above, and then try a geom_text(aes(label=President), nudge_y = 100, angle=90, hjust=0 ) layer for labels. Then save the plot as an object (x = ggplot(metadata, aes... will do), and call ggplotly(x) on that object (this is from the plotly package, which we already loaded in the beginning). Using the content of the speeches, we could also map them out in some n-dimensional space. Let’s give that a try.
library(umap) # dimension reduction package we'll be using here
library(shadowtext) # ggplot addon for shaded labels
# Let's parse the corpus and distill it into a doc-term matrix
parsed =
data_corpus_inaugural %>%
tokens(remove_numbers = T, remove_punct = T) %>% # tokenize
dfm(tolower = T) %>% # into dfm (also lowercase)
dfm_remove(c(stopwords('english'), # remove stopwords
"can", "may", "every", "*ly"), # also remove a few extra words and -ly adverbs
valuetype="glob") %>% # glob matching makes "*ly" match all words ending in -ly
dfm_wordstem() %>% # also stem: remove suffixes to get a more compact/comparable lexicon
dfm_tfidf()
# this also applies TF-IDF (term frequency - inverse document frequency) weighting to our matrix; this lowers the importance of common words but increases the importance of words which distinguish documents.
parsed[1:3, 1:7] # matrix with weights instead of frequencies, and words are stemmed
## Document-feature matrix of: 3 documents, 7 features (42.86% sparse) and 4 docvars.
## features
## docs fellow-citizen senat hous repres among
## 1789-Washington 0.4920984 0.624724 1.249448 0.8972654 0.1373836
## 1793-Washington 0 0 0 0 0
## 1797-Adams 1.4762952 0.624724 1.874172 1.3458982 0.5495342
## features
## docs vicissitud incid
## 1789-Washington 0.9927008 0.925754
## 1793-Washington 0 0
## 1797-Adams 0 0
ncol(parsed) # lexicon size
## [1] 5303
# Quick comparison, how big would the lexicon be if we didn't do any cleaning, lowercasing and stemming
data_corpus_inaugural %>% tokens() %>% dfm() %>% ncol()
## [1] 9439
# get 2D coordinates from the UMAP dimension reduction algorithm and add the metadata
# (note: UMAP involves randomness, so your layout may look a bit different from run to run)
coords = umap(as.matrix(parsed))$layout %>%
as.data.frame() %>%
mutate(year = summary(data_corpus_inaugural)$Year,
speech=summary(data_corpus_inaugural)$Text
)
# Plot: each point is one speech, colored by year
ggplot(coords, aes(V1, V2, label=speech, color=year))+
geom_point()+
geom_text(hjust=-0.1, size=3)+
scale_color_viridis_c()+
theme_dark()+
theme(axis.title=element_blank(),
legend.position = "none")+
NULL
# We could also plot all the words according to their usage frequencies over time; let's rerun the pipeline once more
parsed_words =
data_corpus_inaugural %>%
tokens(remove_numbers = T, remove_punct = T) %>%
dfm(tolower = T) %>%
dfm_wordstem() %>%
dfm_remove(c(stopwords('english'),
"can", "may", "every", "*ly"), valuetype="glob") %>%
dfm_trim(min_termfreq = 10) %>% # exclude very low frequency words
dfm_smooth() %>% # smoothing, for later log-transform
dfm_weight("prop") %>% # normalize by document (which have different lengths)
dfm_weight(scheme="logcount", force=T) %>% # transform frequencies to log scale
t() %>% as.matrix() # transpose (since we're interested in words this time)
# why the log scaling? because words in a text are not distributed uniformly, but rather according to what's referred to as the Zipf's law - there are always a few very frequent words and a long tail of very infrequent words.
# Let's run another UMAP model (this may take a bit longer)
coords2 = umap(parsed_words)$layout %>% as.data.frame()
# add the words and years (and some extra info we'll use later;)
metadata = summary(data_corpus_inaugural)
coord_metadata = coords2 %>%
mutate(word=rownames(.)) %>% # the word itself (rows of parsed_words are words)
mutate(maxvalue = apply(parsed_words, 1, max)) %>% # each word's peak (log) frequency
mutate(maxyear = metadata$Year[apply(
parsed_words, 1, function(x) which.max(x))] # year of that peak
) %>%
mutate(topspeeches = apply(
parsed_words, 1, function(x) metadata$Text[head(order(x, decreasing = T),3)] %>%
paste(collapse=", ") ) # the 3 speeches where the word is most frequent
) %>%
mutate(topspeeches = paste(word, "\n", topspeeches))
# the last two calls fetch the year where a given word is the most frequent
# Let's plot: it places words that are frequent in similar years closer
# we'll use geom_shadowtext from the shadowtext package for shaded labels
# Dark blue = most common in earlier times, lighter = most frequent year in recent times
# Light word among dark words: something that was likely used back in the day, and now again
# Dark gray: words with no particularly outstanding year.
# NOTE(review): the na.value color below would only show if maxyear were NA - nothing is explicitly set to NA in this pipeline; confirm intent.
ggplot(coord_metadata, aes(V1, V2, color=maxyear, label=word))+
geom_shadowtext(size=3, bg.color="white")+
scale_color_viridis_c(option="E", end = 0.9, na.value = "gray10")+
labs(color="most\nfrequent\nin year...")+
NULL
# That is a looooot of words though, and quite hard to read. One solution would be to only plot a sample of the words:
# this groups the data by decades and samples top words from each group,
# and also sets the size to be the maximum log frequency value of the word, so more important words are highlighted.
# While we're at it, why not try a different color scheme too.
# (since sample_n is random, the labeled words will differ every time you run this)
ggplot(data=coord_metadata %>% group_by(round(maxyear/10)) %>% sample_n(3),
aes(V1, V2, color=maxyear, label=word, size=maxvalue))+
geom_point(data=coord_metadata, alpha=0.3)+ # all words as faint points in the background
geom_shadowtext(hjust=-0.1, bg.color="black")+ # labels only for the sampled words
scale_color_viridis_c(option="E", end = 0.9, na.value = "gray15")+
scale_size(guide="none")+
labs(color="most\nfrequent\nin year...")+
theme_void()+
theme(plot.background = element_rect("black"),
panel.background = element_rect("black"),
legend.text = element_text(color="gray")
)+
NULL
# Wouldn't it be nice if the plot showed just some labels, but then you could hover with your mouse to see more labels...?
library(plotly) # for doing interactive plots
# plotly can be used to create the same sorts of plots as you've done with the ggplot() function, except interactive.
# It can be used to create interactive plots from scratch, or to convert (most) ggplots.
# Let's re-do the same plot as above, but save it as an object
g = ggplot(data=coord_metadata %>% group_by(round(maxyear/10)) %>% sample_n(3),
aes(V1, V2, color=maxyear, label=word, size=maxvalue, text=topspeeches))+
geom_point(data=coord_metadata, alpha=0.3)+
geom_text(hjust=-0.1)+
scale_color_viridis_c(option="E", end = 0.9, na.value = "gray15")+
scale_size(guide="none")+
labs(color="most\nfrequent\nin year...")+
theme_void()+
theme(plot.background = element_rect("black"),
panel.background = element_rect("black"),
legend.text = element_text(color="gray")
)+
NULL
# note the extra parameter text=topspeeches - this records the top 3 speeches where this word is most frequent, and can be passed on to plotly.
ggplotly(g, tooltip="text") # this is the ggplot -> plotly converter function
# explore a bit; some light points among dark clouds and vice versa are quite interesting.
# troubleshooting: on some older computers with certain graphics hardware, this might not display: in that case click the little "show in new window" icon (arrow and box) top right of the plotting area to open in a browser.
Another way to compare all variables to all variables all at once is to use a heatmap. Unlike dimension reduction such as UMAP, no information is reduced or compressed, but the interpretation is perhaps not immediately as intuitive as reading a scatterplot of a dimension reduction.
library(quanteda)
library(quanteda.textstats) # for textstat_simil
# turn the corpus into a doc-term matrix again
docterm = data_corpus_inaugural %>%
tokens(remove_punct = T, remove_symbols = T, remove_numbers = T) %>%
dfm(tolower = T) %>%
dfm_remove(stopwords()) %>%
dfm_wordstem() %>%
dfm_tfidf()
# Let's have a quick look at the tfidf scores matrix:
docterm[1:5, 1:5]
## Document-feature matrix of: 5 documents, 5 features (44.00% sparse) and 4 docvars.
## features
## docs fellow-citizen senat hous repres among
## 1789-Washington 0.4920984 0.624724 1.249448 0.8972654 0.1373836
## 1793-Washington 0 0 0 0 0
## 1797-Adams 1.4762952 0.624724 1.874172 1.3458982 0.5495342
## 1801-Jefferson 0.9841968 0 0 0.4486327 0.1373836
## 1805-Jefferson 0 0 0 0 0.9616849
# Let's calculate the cosine similarity of the texts (speeches), as similarity between the vectors across words
# (the diagonal - each document's similarity with itself, always 1 - is blanked out with NA so it doesn't dominate the color scale)
docsim = textstat_simil(docterm, method="cosine"); diag(docsim)=NA
# This yields a pretty large matrix (n*n documents), which would be pretty hard to comprehend just by staring at it...
dim(docsim)
## [1] 59 59
docsim[1:5, 1:5] # first five
## 5 x 5 Matrix of class "dgeMatrix"
## 1789-Washington 1793-Washington 1797-Adams 1801-Jefferson
## 1789-Washington NA 0.06873813 0.15011458 0.11916878
## 1793-Washington 0.06873813 NA 0.04812951 0.04017544
## 1797-Adams 0.15011458 0.04812951 NA 0.14184338
## 1801-Jefferson 0.11916878 0.04017544 0.14184338 NA
## 1805-Jefferson 0.12188694 0.05094464 0.12630616 0.17043868
## 1805-Jefferson
## 1789-Washington 0.12188694
## 1793-Washington 0.05094464
## 1797-Adams 0.12630616
## 1801-Jefferson 0.17043868
## 1805-Jefferson NA
# Let's turn it into a heatmap visualization instead. One extra step though: this is a "wide" format matrix, while ggplot expects data in a "long" format, so let's convert first.
docsim_long = as.data.frame.table(as.matrix(docsim), responseName = "similarity") # run this first
ggplot(docsim_long, aes(Var1, Var2, fill=similarity)) +
# add things between here...
geom_tile() +
theme_bw() +
# ...and here.
NULL
Exercise: between the marked lines above, try adding scale_fill_viridis_c(na.value="white", option="D") (feel free to try other viridis options "A"-"E"), and theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5)) - and always make sure the modifying theme() comes after theme preset commands like theme_bw(). While we’re at it, let’s try to probe into the corpus of speeches and use some more interactive plotting tools to visualize it.
library(tidyr) # part of tidyverse; used here to gather data into long format
library(quanteda) # more of that
library(text2vec) # that's new; will use for topic models
# These lines of codes will create a document-term matrix that we can use to extract the top terms (after removing stopwords) from the speeches, but also to train a topic model and visualise its contents.
# Note the double assignment: this stores the SAME count matrix in both docterm2
# and docterm, overwriting the earlier tf-idf-weighted docterm - so the LDA call
# further below, which uses docterm, gets raw counts as required.
docterm2 = docterm = data_corpus_inaugural %>%
tokens(remove_punct = T, remove_symbols = T, remove_numbers = T) %>%
dfm(tolower = T) %>%
dfm_remove(stopwords()) %>%
dfm_wordstem() # no tfidf this time, as LDA works with counts
# Quick look into what's in there:
docterm2
## Document-feature matrix of: 59 documents, 5,385 features (89.22% sparse) and 4 docvars.
## features
## docs fellow-citizen senat hous repres among vicissitud incid life
## 1789-Washington 1 1 2 2 1 1 1 1
## 1793-Washington 0 0 0 0 0 0 0 0
## 1797-Adams 3 1 3 3 4 0 0 2
## 1801-Jefferson 2 0 0 1 1 0 0 1
## 1805-Jefferson 0 0 0 0 7 0 0 2
## 1809-Madison 1 0 0 1 0 1 0 1
## features
## docs event fill
## 1789-Washington 2 1
## 1793-Washington 0 0
## 1797-Adams 0 0
## 1801-Jefferson 0 0
## 1805-Jefferson 1 0
## 1809-Madison 0 1
## [ reached max_ndoc ... 53 more documents, reached max_nfeat ... 5,375 more features ]
docterm2[1:3, 1:5] # each document as a vector of words
## Document-feature matrix of: 3 documents, 5 features (33.33% sparse) and 4 docvars.
## features
## docs fellow-citizen senat hous repres among
## 1789-Washington 1 1 2 2 1
## 1793-Washington 0 0 0 0 0
## 1797-Adams 3 1 3 3 4
# Let's train a quick topic model with 5 topics:
# (docterm here is the raw count matrix created just above - not the earlier tf-idf one, which it overwrote)
lda = LDA$new(n_topics = 5); topicmodel=lda$fit_transform(docterm)
## INFO [13:15:35.471] early stopping at 40 iteration
## INFO [13:15:36.285] early stopping at 20 iteration
# the object is just a big matrix d*t
topicmodel[1:2, ] # columns are topics, documents are probability distributions
## [,1] [,2] [,3] [,4] [,5]
## 1789-Washington 0.05444785 0.5067485 0.04524540 0.2283742 0.1651840
## 1793-Washington 0.08225806 0.3677419 0.09193548 0.2032258 0.2548387
sum(topicmodel[1, ]) # each document's topic probabilities sum to 1
## [1] 1
# extract keywords from each topic, paste together into vectors
topterms = lda$get_top_words(n = 10, lambda = 0.3) %>%
apply(2,paste,collapse=" ")
topterms # that's a bit more human-readable
## [1] "us new america let today god american togeth face centuri"
## [2] "constitut state union power govern general object foreign opinion institut"
## [3] "life freedom peopl human seek believ know ideal men land"
## [4] "made republ intern express wish maintain can hold may defens"
## [5] "law congress polici upon practic promot support reason view import"
# cast the document-topic distribution as long data:
tidymodel = as.data.frame(topicmodel) %>%
rownames_to_column("speech") %>%
gather("topic","value", V1:V5) %>% # wide -> long (one row per speech-topic pair)
mutate(topic=factor(topic, labels = topterms)) # label topics with their keywords
# top keywords for each topic are plotted in the legend:
ggplot(tidymodel, aes(x=speech, y=value, fill=topic)) +
geom_bar(position="stack", stat="identity") +
guides(fill=guide_legend(nrow=5,title.position="top")) +
coord_cartesian(expand = 0) +
theme(axis.text = element_text(angle=45, hjust=1),
legend.position = "top",
legend.justification = "left"
)+
NULL
Exercise: try adding scale_fill_manual(values=c("gold", "skyblue","blue","forestgreen","darkred" )) to the plot above. The corpus we used here as a toy example came with an R package. If you’re working with your actual data, you will need to import it into R somehow. Corpora exist in quite different formats though. Here, we will generate examples of these formats from the inaugural speeches we’re already familiar with, and then look into importing and working with these formats.
Generate examples - make sure to run this, otherwise nothing else in this section will work
library(quanteda)
library(writexl)
library(jsonlite)
library(readr)
library(xml2)
library(xmlconvert)
# The files will be saved in this folder, the default "working directory".
getwd() # if you'd rather use a different folder, set it using: setwd("path/to/folder")
# But we'll create a subfolder for all the examples so you can easily find them:
dir.create("corpus_examples")
dir.exists("corpus_examples") # did it work? (should say TRUE) If it didn't, ask for help, or manually create a folder called "corpus_examples" in your working directory.
#### Let's generate our example files:
# This will be needed a few times so let's create an object:
# a data frame with the metadata of the first 3 speeches plus a column with the full text (newlines replaced with spaces)
speeches_frame =
data_corpus_inaugural[1:3] %>%
as.list() %>%
lapply(function(x) paste(x) %>% gsub("\n", " ",.) ) %>%
{data.frame(summary(data_corpus_inaugural[1:3]), speech=unlist(.))}
## One big plain text file, 1 text per line:
data_corpus_inaugural[1:3] %>%
lapply(function(x) paste(x) %>% gsub("\n", " ",.) ) %>%
write_lines("corpus_examples/big_plaintext.txt")
# could also use base R writeLines(), but the readr function saves as unicode by default
## Multiple plain text files
sapply(1:3, function(x){
write_lines(speeches_frame$speech[x],
paste0("corpus_examples/small_plaintext_",x,".txt"));x
})
## Delimited files - CSV and TSV
speeches_frame %>% write_csv("corpus_examples/bigcsv.csv")
speeches_frame %>% write_delim("corpus_examples/bigtsv.txt", delim="\t")
## XLSX (yes, Excel; for Word docx, see the officer package)
speeches_frame %>% write_xlsx("corpus_examples/excel.xlsx")
## JSON
speeches_frame %>% write_json("corpus_examples/JSON.json" )
## XML
speeches_frame %>% df_to_xml() %>%
write_xml("corpus_examples/XML.xml")
## HTML - build a minimal page with one <h2> heading and <p> paragraph per speech
lapply(1:3, function(x) paste("\n<h2>",speeches_frame$Text[x],"</h2>\n<p>", speeches_frame$speech[x],"</p>")) %>%
unlist() %>%
paste(collapse="\n<br>") %>%
{paste(
"<!DOCTYPE html>
<head>
</head>
<body>
<h1>An example corpus</h1>",
.,
"</body>
</html>")} %>%
write_lines( "corpus_examples/web.html" )
But what if the corpus is really massive, and larger than would fit into memory? For example if the corpus is 10GB and you only have 8GB of RAM? There are easy solutions for that! All the readr package functions have a lazy loading option (won’t immediately load everything), there’s the data.table package which has fread, optimized for large files; and text2vec has iterator functions for building e.g. doc-term matrices that iterate through files one by one, instead of importing everything at once (assuming the large corpus is split into files).
In the following, format-specific import functions are provided, as well as examples using readtext(), where applicable. The readtext package, an add-on to quanteda, aims to provide a swiss knife importing solution for most common corpus types.
library(readtext) # quanteda companion package
library(readr)
library(readxl)
library(jsonlite)
# Let's have a look at the different file imports; we'll be just calling the import functions, not saving the results as objects, so the results will just show down in the Console (make sure it's not minimized).
## One big plain text file, 1 text per line:
read_lines("corpus_examples/big_plaintext.txt") %>%
substr(1, 100)
# feeding the result into the substring function so it doesn't flood your Console
# readtext also works, with some extra segmentation:
readtext("corpus_examples/big_plaintext.txt") %>%
corpus() %>%
corpus_segment("\n", pattern_position = "after", extract_pattern = F)
## Multiple plain text files
# the same function works with a list of files
list.files("corpus_examples", full.names = T, pattern = "small_plaintext") %>%
read_lines() %>%
substr(1, 100)
# readtext:
list.files("corpus_examples", full.names = T, pattern = "small_plaintext") %>%
readtext()
## Delimited files - CSV and TSV
read_csv("corpus_examples/bigcsv.csv")
read_delim("corpus_examples/bigtsv.txt", delim = "\t")
# readtext:
readtext("corpus_examples/bigcsv.csv")
## XLSX
read_excel("corpus_examples/excel.xlsx")
# or:
readtext("corpus_examples/excel.xlsx")
## JSON
read_json("corpus_examples/JSON.json") %>% # gives a list
lapply(substr, 1, 100) # just shortening again
read_json("corpus_examples/JSON.json", simplifyVector = T ) %>% tibble() # or as a data frame
# or:
readtext("corpus_examples/JSON.json", text_field = "speech")
## XML
read_xml("corpus_examples/XML.xml") # just reads and parses xml
read_xml("corpus_examples/XML.xml") %>%
xml_find_all(xpath = "//speech") # extract speeches too
# or:
readtext("corpus_examples/XML.xml", text_field = "speech")
## HTML (a whole topic on its own though)
# We can use the same xml2 package to extract tags from a html page
read_html("corpus_examples/web.html") %>%
xml_find_all(xpath = "//p") # all <p> paragraph nodes
Important part: you generally want to work with Unicode (UTF-8) encoding whenever possible. If a corpus comes in another encoding, then try to figure it out, and specify the encoding as a parameter in whatever import function you’re using (there’s usually a parameter for that, see help files), otherwise many characters are bound to break.
Different options exist depending on your research questions etc., but one way is to detect the language and treat each separately.
library(cld3) # language detection package
library(ggplot2movies) # an imdb movies data subset
# Let's create another simulated corpus: the longest movie titles in the dataset
titles = movies %>%
filter(nchar(title)>80) %>%
select(title)
titles # some titles
## # A tibble: 20 x 1
## title
## <chr>
## 1 Candid Camera Story (Very Candid) of the Metro-Goldwyn-Mayer Pictures 1937 C~
## 2 Chronicle History of King Henry the Fift with His Battell Fought at Agincour~
## 3 Daehakno-yeseo maechoon-hadaka tomaksalhae danghan yeogosaeng ajik Daehakno-~
## 4 Easy Riders, Raging Bulls: How the Sex, Drugs and Rock 'N' Roll Generation S~
## 5 Epic Tale of Kalesius and Clotho: A Meditation on the Impossibility of Roman~
## 6 Fatto di sangue fra due uomini per causa di una vedova - si sospettano moven~
## 7 Film d'amore e d'anarchia, ovvero 'stamattina alle 10 in via dei Fiori nella~
## 8 I Killed My Lesbian Wife, Hung Her on a Meat Hook, and Now I Have a Three-Pi~
## 9 Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!~
## 10 Long Strange Trip, or The Writer, the Naked Girl, and the Guy with a Hole in~
## 11 M.A. Numminen Turns Rabbit - The Universal Declaration of the Rights of the ~
## 12 Man Who Might Have Been: An Inquiry Into the Life and Death of Herbert Norma~
## 13 Man with the Smallest Penis in Existence and the Electron Microscope Technic~
## 14 Miyazawa Kenji - Ginga-tetsudo no yoru/Kokto de la galaksia fervojo de Miyaz~
## 15 Personal History, Adventures, Experience, and Observation of David Copperfie~
## 16 Riusciranno i nostri eroi a ritrovare l'amico misteriosamente scomparso in A~
## 17 Rough Sketch of a Proposed Film Dealing with the Powers of Ten and the Relat~
## 18 Saga of the Viking Women and Their Voyage to the Waters of the Great Sea Ser~
## 19 Those Magnificent Men in Their Flying Machines, or How I Flew from London to~
## 20 What I Want My Words to Do to You: Voices From Inside a Women's Maximum Secu~
# add a column with the detected language code for each title
title_lang = titles %>%
mutate(language = detect_language(title)) %>% # cld3 function
select(language, title)
title_lang # note that detection is not perfect, e.g. the Korean title in row 3 below
## # A tibble: 20 x 2
## language title
## <chr> <chr>
## 1 en Candid Camera Story (Very Candid) of the Metro-Goldwyn-Mayer Pictur~
## 2 en Chronicle History of King Henry the Fift with His Battell Fought at~
## 3 sn Daehakno-yeseo maechoon-hadaka tomaksalhae danghan yeogosaeng ajik ~
## 4 en Easy Riders, Raging Bulls: How the Sex, Drugs and Rock 'N' Roll Gen~
## 5 en Epic Tale of Kalesius and Clotho: A Meditation on the Impossibility~
## 6 it Fatto di sangue fra due uomini per causa di una vedova - si sospett~
## 7 it Film d'amore e d'anarchia, ovvero 'stamattina alle 10 in via dei Fi~
## 8 en I Killed My Lesbian Wife, Hung Her on a Meat Hook, and Now I Have a~
## 9 en Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up~
## 10 en Long Strange Trip, or The Writer, the Naked Girl, and the Guy with ~
## 11 en M.A. Numminen Turns Rabbit - The Universal Declaration of the Right~
## 12 en Man Who Might Have Been: An Inquiry Into the Life and Death of Herb~
## 13 en Man with the Smallest Penis in Existence and the Electron Microscop~
## 14 ja-Latn Miyazawa Kenji - Ginga-tetsudo no yoru/Kokto de la galaksia fervojo~
## 15 en Personal History, Adventures, Experience, and Observation of David ~
## 16 it Riusciranno i nostri eroi a ritrovare l'amico misteriosamente scomp~
## 17 en Rough Sketch of a Proposed Film Dealing with the Powers of Ten and ~
## 18 en Saga of the Viking Women and Their Voyage to the Waters of the Grea~
## 19 en Those Magnificent Men in Their Flying Machines, or How I Flew from ~
## 20 en What I Want My Words to Do to You: Voices From Inside a Women's Max~
This can happen when somebody has accidentally converted between unicode and non-unicode encodings improperly. One way to deal with it is to, well, try to fix it.
library(stringr) # part of tidyverse
# Let's create a messed up string (mojibake: UTF-8 text mis-decoded as a single-byte encoding)
messy = "Mu hõljuk on angerjaid täis. Mans gliseris ir pilns ar zuŠiem"
# manual fix: map each broken byte pair back to the intended character
str_replace_all(messy, c("õ" = "õ",
"ä" = "ä",
"Š" = "š"
))
## [1] "Mu hõljuk on angerjaid täis. Mans gliseris ir pilns ar zušiem"
# Hint: if you get unexpected results from string operations or string comparisons, then look into changing the Locale setting in R - it's a can of worms we're not going to get into today though. For example, this gives me TRUE on my English-language locale, but if you have the Latvian locale set, it should correctly say FALSE (recognizing these as different letters)
Sys.getlocale()
## [1] "LC_COLLATE=English_United Kingdom.1252;LC_CTYPE=English_United Kingdom.1252;LC_MONETARY=English_United Kingdom.1252;LC_NUMERIC=C;LC_TIME=English_United Kingdom.1252"
"ī" == "i" # should be False
## [1] TRUE
If it’s all just encoded html entities, fixing them all manually is suboptimal, as there’s packages for that.
library(textutils) # for HTML entity encoding/decoding
# HTMLencode turns non-ASCII characters into HTML entities (like &otilde;)...
entities = HTMLencode("Mu hõljuk on angerjaid täis in Latvian is: 'mans gliseris ir pilns ar zušiem'")
entities
## [1] "Mu h&otilde;ljuk on angerjaid t&auml;is in Latvian is: 'mans gliseris ir pilns ar zu&scaron;iem'"
# ...and HTMLdecode turns them back into regular characters:
HTMLdecode(entities)
## [1] "Mu hõljuk on angerjaid täis in Latvian is: 'mans gliseris ir pilns ar zušiem'"
This is probably a good time to look into regex, or regular expressions. A regex defines a string pattern to be used for searching in text, using a limited set of special operators.
Repetition: ? = The preceding item is optional and will be matched at most once. * = The preceding item will be matched zero or more times. + = The preceding item will be matched one or more times. {n,m} = The preceding item is matched at least n times, but not more than m times.
Sets: [ab] = Define set of characters, matches any; [ab] matches a or b; [ab]+ matches a, aaa, b, bbb, abab, etc. [^a] = Matches anything except a. [0-9] = Matches all numbers [a-z] = Matches all lowercase letters (locale-specific) (a|b) = Matches a or b . = Matches anything (.* = matches any number of anythings, greedy) [[:punct:]] = Matches all (most) punctuation symbols [[:space:]] = Matches all whitespace characters (space, newline)
Other: ^ = Matches beginning of string. $ = Matches end of string.
# In base R, there's the grep function to find, and gsub to find+replace. The stringr package provides alternatives and additions to these functionalities; quanteda has a kwic function too.
# grep returns the INDICES of the elements that match the pattern:
grep("walk(ed|s)", c("I walked", "she walks")) # matches both strings
## [1] 1 2
grep("walk.*", c("I walked", "she walks")) # .* greedily matches everything
## [1] 1 2
grep("[0-9]+", c("1", "123", "123abc", "abc")) # + requires at least 1 match
## [1] 1 2 3
grep("[0-9]*", "abc") # matches, because * = zero or more
## [1] 1
Let’s create a sentence corpus to work with
library(quanteda)
library(stringr)
# split each speech into sentences and pair them with the speech metadata
sents = data_corpus_inaugural %>%
tokens("sentence") %>% as.list() %>%
tibble(sentence=.,
summary(data_corpus_inaugural) %>%
select(Text, Year, President, Party)
) %>%
unnest(cols = sentence)
sents[1:3,] # we now have a data frame where each sentence has its own row, but all of them retain the metadata (thanks to the unnest function)
grep("memor(y|ies)", sents$sentence, ignore.case = T, value=T)
grep("I would like", sents$sentence, ignore.case = T, value=T)
kwic(tokens(sents$sentence), pattern="hereafter", valuetype = "regex", window=2 )
# Can we count mentions of men vs women?
kwic(tokens(sents$sentence), pattern="men", valuetype = "regex", window=2 ) # hmm this won't work, as regex matches don't care about surrounding characters
kwic(tokens(sents$sentence), pattern="^men$", valuetype = "regex", window=2 ) # this works because words are tokenized beforehand, so we can easily use the beginning & end operator
kwic(tokens(sents$sentence), pattern="^women$", valuetype = "regex", window=2 )
# grepl provides a logical vector instead of indices or values, which works for filtering:
sents %>% filter(grepl("Vietnam", sentence))
sents %>% filter(grepl("Europe", sentence))
# Quantify? Count what gets blessed: extract "bless <word>" phrases and bar-plot them
sents %>%
mutate(blessing = str_extract(sentence, "bless [a-zA-Z]+")) %>% # uses stringr
filter(!is.na(blessing)) %>%
ggplot(aes(y=blessing))+
geom_bar()
# classify each sentence by whether it uses masculine or feminine pronouns
sents %>%
mutate(gender=case_when(
grepl(" (he|him|his)[ [:punct:]]", sentence) ~ "M",
grepl(" (she|her|hers)[ [:punct:]]", sentence) ~ "F",
)) %>%
filter(!is.na(gender)) %>%
ggplot(aes(y=gender))+
geom_bar()
# same, but counted per year, as a time series
sents %>%
mutate(gender=case_when(
grepl(" (he|him|his)[ [:punct:]]", sentence) ~ "M",
grepl(" (she|her|hers)[ [:punct:]]", sentence) ~ "F",
)) %>%
filter(!is.na(gender)) %>%
group_by(gender, Year) %>%
count() %>%
ggplot(aes(x=Year,y=n, color=gender))+
geom_line()+
geom_point()
# Try things out here
Often there’s not enough text data available to train your own topic model or word embedding; fortunately, there’s an ever growing number of pretrained models available (the keyword is “transfer learning”). We’ll look at one easy and fast example here, using fasttext pretrained word embeddings in a doc2vec sentence vectorization model. This is the earlier, type-based word embedding paradigm (fasttext is an improvement on word2vec), precursor to the token-based language models like BERT.
library(text2vec) # provides cosine similarity function
library(readr) # for import
library(quanteda) # for tokenization
library(doc2vec) # doc2vec model training
# Download a small pretrained embedding matrix: one row per word (word = rowname), columns are the embedding dimensions
vecs = read_delim("https://raw.githubusercontent.com/andreskarjus/artofthefigure/master/riga2022/mini_fasttext.csv", col_names = F, progress = T, lazy = F) %>%
column_to_rownames(var = "X1") %>% as.matrix()
dim(vecs) # It's a reduced version of the 2-billion word fasttext English model
vecs[1:3, 1:4] # rows are words
# Let's extract the closing words of the speeches, last 15 words (~length of your average tweet)
lastwords = data_corpus_inaugural %>%
tokens(remove_punct = T) %>%
tokens_tolower() %>% as.list() %>%
lapply(function(x) x[(length(x)-15):length(x)] %>% paste(collapse=" ") ) %>%
unlist() %>%
tibble(text=., summary(data_corpus_inaugural) %>% rename(doc_id=Text))
# (and append the metadata of the corpus to it)
# Train the doc2vec (or paragraph2vec) model, using the pretrained word embeddings
# FIX: the embeddings matrix created above is called `vecs`, not `vecs1`;
# `vecs1` is undefined at this point and would throw an "object not found" error
vecmodel = paragraph2vec(lastwords %>% select(doc_id, text),
embeddings = vecs, dim=300,
min_count = 0, iter=0) # iter=0: don't retrain, just compose sentence vectors from the pretrained word vectors
sentvecs = as.matrix(vecmodel, which = "docs") # extracts the new sentence vectors
# Word similarity in the original embeddings: nearest neighbors by cosine similarity
sim2(vecs, vecs["president",,drop=F])[,1] %>% sort(decreasing = T) %>% head()
sim2(vecs, vecs["money",,drop=F])[,1] %>% sort(decreasing = T) %>% head()
# Using the sentence embeddings, let's explore the data
# Whose speech is most similar to Trump's closing words?
sim2(sentvecs, sentvecs[lastwords$doc_id=="2017-Trump",,drop=F ])[,1] %>%
order(decreasing = T) %>% lastwords[.,] %>% head(3) %>% select(doc_id, text)
# Which are most dissimilar?
sim2(sentvecs, sentvecs[lastwords$doc_id=="2017-Trump",,drop=F ])[,1] %>%
order(decreasing = F) %>% lastwords[.,] %>% head(3) %>% select(doc_id, text)
# Which speeches close with religious content? (here using the word vector from the original embeddings to find similar sentence embeddings - since the doc2vec model wasn't further trained on new word contexts (iter=0), both embeddings remain in the same space)
sim2(sentvecs, vecs["religion",,drop=F])[,1] %>%
order(decreasing = T) %>% lastwords[.,] %>% head(3) %>% select(doc_id, text)
# Which don't?
sim2(sentvecs, vecs["religion",,drop=F])[,1] %>%
order(decreasing = F) %>% lastwords[.,] %>% head(3) %>% select(doc_id, text)
# We could also do a semantic search for concepts, word by word, instead of the sentence average.
# Tokenize the sentences:
lastwords_tok = lastwords %>%
mutate(tok=tokens(text) %>% as.list())
# Define a little semantic search function that tries to maximize the semantic similarity by individual terms (just run this code block)
# term: a word that must exist in the embedding model's vocabulary
# dat:  a tibble with a `tok` list-column of tokenized texts (default: lastwords_tok)
# vecs1: the word embedding matrix (default: vecs)
# n:    how many top-matching documents to return
# Returns the n documents containing the word most similar to `term`.
semsearch = function(term, dat=lastwords_tok, vecs1=vecs, n=3){
if(term %in% rownames(vecs1)){
sapply(dat$tok, function(x)
# sim2 here yields a k x 1 matrix: similarity of each in-vocabulary token to the term
# NOTE(review): .[lower.tri(.)] on a k x 1 matrix drops its FIRST row (the first
# token's similarity), and yields an empty vector (max -> -Inf, with a warning)
# when only one token is in vocabulary - taking max over the whole column
# ( .[,1] %>% max() ) looks like the intent; confirm before changing behavior.
sim2(vecs1[intersect(x, rownames(vecs1)),,drop=F], vecs1[term,,drop=F]) %>%
.[lower.tri(.)] %>% max()) %>%
order(decreasing = T) %>% dat[.,] %>%
head(n) %>% select(doc_id, text)
} else { stop("Term not in embedding model, try something else") }
}
# Let's see
semsearch("peace")
semsearch("friendship")
(if we have time)